package cn.edu.scut.suggestion.corpus;

/**
 * Converts existing parsed files into the input file format required by JGibbLDA.
 * @author tian.yuchen
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import cn.edu.scut.suggestion.segmentation.Segment;

/**
 * Command-line utility that walks a directory tree of parsed text files,
 * segments their contents with {@link Segment}, and appends the result to a
 * single output file, one record per line.
 *
 * <p>Inside each input file, a line starting with {@code "Connection"} is
 * treated as a record boundary: the segmented text accumulated since the
 * previous boundary is emitted as one output line.
 *
 * <p>NOTE(review): JGibbLDA's documented input format expects the number of
 * documents on the first line of the file; this class does not write that
 * header (it only prints a per-file counter to stdout) - confirm whether it
 * is added by a later step.
 */
public class LDAInput {

	/**
	 * Processes the hard-coded input directory and writes the corpus to the
	 * hard-coded output file.
	 *
	 * @param args ignored
	 * @throws IOException if the output file cannot be created or any file
	 *                     cannot be read or written
	 */
	public static void main(String[] args) throws IOException {
		String path = "/home/tian.yuchen/data/uestc/parse/1-6/3";
		String outpathString="/home/tian.yuchen/querysuggestion_data/ldainput";

		File rootfolder = new File(path);
		File ldaInFile = new File(outpathString);

		FileOutputStream fos = new FileOutputStream(ldaInFile);
		OutputStreamWriter osw = null;
		try {
			// Input is decoded as UTF-8, so encode the output as UTF-8 too
			// instead of relying on the platform default charset.
			osw = new OutputStreamWriter(fos, "UTF-8");
			visitFile(rootfolder, osw);
		} finally {
			// Closing the writer flushes its internal buffer; without this the
			// tail of the output could be lost when the JVM exits.
			if (osw != null) {
				osw.close();
			} else {
				fos.close();
			}
		}
	}

	/**
	 * Recursively visits {@code file}. Directories are descended into; for a
	 * regular file, every line is read as UTF-8 and either closes the current
	 * record (lines starting with {@code "Connection"}) or is segmented via
	 * {@link Segment#splitToString} and appended to the current record.
	 * Non-empty records are written to {@code osw} followed by a newline.
	 * The number of {@code "Connection"} lines seen and the text "over" are
	 * printed to stdout after each file.
	 *
	 * <p>The writer is neither flushed nor closed here; that is the caller's
	 * responsibility.
	 *
	 * @param file directory or file to process
	 * @param osw  destination writer shared by all visited files
	 * @throws IOException if reading {@code file} or writing to {@code osw} fails
	 */
	public static void visitFile(File file,OutputStreamWriter osw) throws IOException{
		if (file.isDirectory()) {
			File[] children = file.listFiles();
			if (children == null) {
				// listFiles() returns null on an I/O error or missing permission;
				// skip the directory instead of failing with a NullPointerException.
				System.err.println("Cannot list directory: " + file);
				return;
			}
			for (File child : children) {
				visitFile(child, osw);
			}
			return;
		}

		Segment sg = new Segment();
		FileInputStream fis = new FileInputStream(file);
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
			StringBuilder content = new StringBuilder();
			int count = 0;
			String str;
			while ((str = br.readLine()) != null) {
				if (str.startsWith("Connection")) {
					count++;
					// Emit the record only if something was accumulated, so
					// consecutive boundaries do not produce blank lines.
					if (content.length() > 0) {
						content.append('\n');
						osw.write(content.toString());
					}
					content.setLength(0);
				} else {
					content.append(sg.splitToString(str));
				}
			}
			// NOTE(review): any text accumulated after the last "Connection"
			// line is discarded here, as in the original code - confirm this
			// is intended and not a dropped final record.
			System.out.println(count);
			System.out.println("over");
		} finally {
			// Closing the underlying stream releases the file descriptor even
			// when reading, segmentation or writing throws.
			fis.close();
		}
	}

}
